# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# Recursively print the full path of every file under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
# Load the customer records from the Kaggle input directory.
dataset = pd.read_csv('/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
dataset
# Preview the first five rows.
dataset.head(n=5)
# Preview the last five rows.
dataset.tail(n=5)
# Dimensions of the table as a (rows, columns) tuple.
dataset.shape
# Number of missing values in each column.
dataset.isna().sum()
# Summary statistics (count, mean, std, min, quartiles, max).
dataset.describe()
The average customer age is about 38.
Customers visiting the mall are between 18 and 70 years old.
The average annual income is about $60k.
# Bar chart of how many customers there are at each age.
plt.figure(figsize=(15, 15))
sns.countplot(x='Age', data=dataset, palette='gist_rainbow_r')
The most common customer ages fall between 28 and 30; a histogram will give a clearer picture of the age groups.
Let's look at how many of them are male and female...
# Pie chart of the gender split, with the second slice pulled out slightly.
plt.figure(figsize=(8, 8))
gender_counts = dataset['Gender'].value_counts()
gender_counts.plot(kind='pie', autopct='%.2f%%', shadow=True, explode=(0, 0.04))
plt.legend()
The number of females is greater than the number of males.
Females outnumber males by about 12 percentage points.
Females visit the mall most often.
Let us look at the income distribution...
# Bar chart of how many customers there are at each annual-income value.
plt.figure(figsize=(20, 7))
# The column is selected by name through x=. Passing the Series positionally
# together with data= collides with countplot's first parameter (data) on
# seaborn >= 0.12 and raises a TypeError; the keyword form works on all versions.
sns.countplot(data=dataset, x='Annual Income (k$)', palette='gist_stern')
plt.title('Distribution of Annual Income')
Customers with an annual income of 54k or 78k visit the mall most often.
We can compare their income ranges using histograms.
# Two side-by-side panels: age on the left, annual income on the right.
_, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, figsize=(25, 10))

# Age histogram, stacked by gender.
age_hist = sns.histplot(data=dataset, x='Age', hue='Gender', binwidth=9,
                        multiple='stack', ax=ax0, palette='Pastel1')
age_hist.set_title('Age ditribution')

# Annual-income histogram, stacked by gender.
income_hist = sns.histplot(data=dataset, x='Annual Income (k$)', hue='Gender', binwidth=11,
                           multiple='stack', ax=ax1, palette='Dark2')
income_hist.set_title('Income distribution')

plt.show()
20-30 and 30-40 are the most common customer age groups.
The largest customer age group is 30-year-olds, and they are mostly women.
Most customers earn between \$50k and \$80k.
Very few people earn more than $120k.
Let's look at the spending scores by age.
# lmplot: scatter of age against spending score with a regression line per gender.
sns.lmplot(x='Spending Score (1-100)', y='Age', hue='Gender', data=dataset)
There is a linear relationship between Spending Score and Age.
Young customers tend to have higher spending scores, and the spending score decreases as age goes up.
Let's compare annual income by gender across different ages.
# Violin plot of annual income per age, split into one half per gender.
plt.figure(figsize=(25, 8))
ax = sns.violinplot(
    data=dataset,
    x="Age",
    y="Annual Income (k$)",
    hue="Gender",
    split=True,
    scale="count",
    inner="quartile",
    palette="afmhot",
)
Among the youngest customers, men have the higher annual income; as age increases, women's annual income rises, and at ages above 55 men's annual income is higher again.
The K-means algorithm identifies k centroids and then allocates every data point to the nearest cluster, while keeping the clusters as compact as possible. The ‘means’ in K-means refers to averaging the data, that is, finding the centroid.
We compute the inertia for different numbers of clusters. Inertia is the sum of squared distances of samples to their closest cluster center. We plot inertia against the number of clusters and choose k at the “elbow” point of the curve, i.e. the point after which the inertia starts decreasing in a linear fashion.
# Feature matrix for clustering: the columns at positions 3 and 4 of the dataset
# (presumably annual income and spending score, going by the axis labels on the
# cluster plots; confirm against the CSV header).
X = dataset.iloc[:, [3, 4]].values
from sklearn.cluster import KMeans

# Elbow method: fit K-means for k = 1..10 and record each fit's inertia
# (within-cluster sum of squares).
cluster_counts = range(1, 11)
wcss = []
for i in cluster_counts:
    # A fixed seed keeps the elbow curve reproducible between runs and consistent
    # with the seeded final model.
    kmeans = KMeans(n_clusters=i, init='k-means++', max_iter=500, n_init=15,
                    random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(cluster_counts, wcss, marker='o')
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.grid()
plt.show()
From the plot above we can see that the elbow point is at k=5, after which the curve becomes almost linear. This shows that we can divide the customers into 5 different categories.
# Fit the 5-cluster model and get the cluster index assigned to each row of X.
kmeans = KMeans(n_clusters=5, init='k-means++', random_state=42)
y_kmeans = kmeans.fit_predict(X)

plt.figure(figsize=(8, 8))
# One scatter layer per cluster, each with its own colour and legend entry.
cluster_colors = ('red', 'blue', 'green', 'cyan', 'magenta')
for cluster_id, color in enumerate(cluster_colors):
    members = X[y_kmeans == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label='Cluster {}'.format(cluster_id + 1))
# Overlay the fitted cluster centres as larger markers.
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=300, c='yellow', label='Centroids')
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.grid()
plt.show()
Drawbacks of K-means:
It is difficult to predict the value of K.
It does not work well with global clusters.
Different initial partitions can result in different final clusters.
To address these problems we use hierarchical clustering.
In this type of clustering we do not define initial random clusters; instead, we find the closest pair of clusters according to the distances between points and merge them, and we repeat this until all the points are grouped, so it is a bottom-up approach. It is also known as AGNES (Agglomerative Nesting).
import scipy.cluster.hierarchy as sch

# Build the Ward-linkage merge tree for X and draw it as a dendrogram.
plt.figure(figsize=(8, 8))
ward_links = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(ward_links)
plt.title('Dendrogram')
plt.xlabel('Customers')
plt.ylabel('Euclidean distances')
plt.show()
from sklearn.cluster import AgglomerativeClustering

# Agglomerative (bottom-up) clustering into 5 groups with Ward linkage.
# The distance argument is omitted on purpose: Euclidean is the default and the only
# distance Ward linkage accepts, and the old `affinity` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4, where passing it raises a TypeError.
hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_hc = hc.fit_predict(X)

# One scatter layer per cluster, each with its own colour and legend entry.
for cluster_id, color in enumerate(('red', 'blue', 'green', 'cyan', 'magenta')):
    members = X[y_hc == cluster_id]
    plt.scatter(members[:, 0], members[:, 1], s=100, c=color,
                label='Cluster {}'.format(cluster_id + 1))
plt.title('Clusters of customers')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend()
plt.show()
The average customer age is about 38, and customers of all kinds are present, with annual incomes ranging from \$15k to \$137k.
Most of the customers are women, so we need to pay attention to the products that women mostly focus on.
Most of the customers have an annual income between \$50k and \$80k; these are especially valuable customers for marketers because they may bring more profit.
People with an annual income below \$25k or above \$70k show higher spending scores; marketers need separate strategies for them, as they are the most regular customers.
Customers younger than 30 have an average spending score higher than their income score, and this decreases as age increases, which suggests that older customers are saving, or spending less relative to their income.
🙂THANK YOU FOR SEEING THIS NOTEBOOK🙂